Appendix

In [1]:
# Helper libraries
import numpy as np
import pandas as pd
from time import time
from collections import Counter

import matplotlib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.cluster import homogeneity_score
In [2]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
In [3]:
training_df = pd.read_csv("/Users/gurjy/Downloads/train.csv")
In [4]:
print(training_df.shape)
(42000, 785)
In [5]:
#save label in different variable
target = training_df['label']
# Drop the label feature
training_df.drop("label",axis=1,inplace=True)
In [6]:
target.shape
Out[6]:
(42000,)
In [56]:
X = training_df.values
#scale features
X_std = StandardScaler().fit_transform(X)

# Calculating Eigenvectors and eigenvalues of Cov matirx
mean_vec = np.mean(X_std, axis=0)
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
# Create a list of (eigenvalue, eigenvector) tuples
eig_pairs = [ (np.abs(eig_vals[i]),eig_vecs[:,i]) for i in range(len(eig_vals))]

# Sort the eigenvalue, eigenvector pair from high to low
eig_pairs.sort(key = lambda x: x[0], reverse= True)

# Calculation of Explained Variance from the eigenvalues
tot = sum(eig_vals)
var_exp = [(i/tot)*100 for i in sorted(eig_vals, reverse=True)] # Individual explained variance
cum_var_exp = np.cumsum(var_exp) # Cumulative explained variance
In [8]:
X = training_df.values
X_std = StandardScaler().fit_transform(X)

mean_vec=np.mean(X_std,axis=0)
cov_mat=np.cov(X_std.T)
eigvalues ,eigvectors =np.linalg.eig(cov_mat)

eigpairs=[(np.abs(eigvalues[i]),eigvectors[:,i] )for i in range(len(eigvalues))]


eigpairs.sort(key=lambda x:x[0],reverse=True)
   
tot=sum(eigvalues)
var_exp=[(i/tot)*100 for i in sorted(eigvalues,reverse=True)]
cum_var_exp=np.cumsum(var_exp)
In [9]:
#shows how many PCAs are good to use to explain data in lower dimension
trace1 = go.Scatter(
    x=list(range(784)),
    y= cum_var_exp,
    mode='lines+markers',
    name="'Cumulative Explained Variance'",
   
    line = dict(
        shape='spline',
        color = 'goldenrod'
    )
)
trace2 = go.Scatter(
    x=list(range(784)),
    y= var_exp,
    mode='lines+markers',
    name="'Individual Explained Variance'",
 
     line = dict(
        shape='linear',
        color = 'black'
    )
)
fig = tls.make_subplots(insets=[{'cell': (1,1), 'l': 0.7, 'b': 0.5}],
                          print_grid=True)

fig.append_trace(trace1,1,1)
fig.append_trace(trace2,1,1)


fig.layout.title='explained Variance plots'
fig.layout.xaxis=dict(range=[0,800],title='Feature columns')
fig.layout.yaxis=dict(range=[0,100],title='explained variance')


py.iplot(fig,filename='inset example')
This is the format of your plot grid:
[ (1,1) x,y ]

With insets:
[ x2,y2 ] over [ (1,1) x,y ]

In [10]:
#make 30 PCAs
pca=PCA(30)
pca.fit(X_std)
Out[10]:
PCA(n_components=30)
In [11]:
X_pca=pca.transform(X_std)
In [12]:
X_pca.shape
Out[12]:
(42000, 30)
In [13]:
X_std.shape
Out[13]:
(42000, 784)
In [14]:
eigenvectors=pca.components_
eigenvectors.shape
Out[14]:
(30, 784)
In [15]:
plt.figure(figsize=(17,16))

x_row=4
y_col=7

for i in list(range(x_row*y_col)):
    
    plt.subplot(x_row,y_col,i+1)
    plt.imshow(eigenvectors[i].reshape(28,28),cmap='twilight_shifted')
    title_='Eigenvector'+str(i+1)
    plt.title(title_)
    plt.xticks(())
    plt.yticks(())
plt.show()
In [16]:
plt.figure(figsize=(12,13))

for i in list(range(0,70)):
    plt.subplot(7,10,i+1)
    plt.title(target[i])
    plt.imshow(training_df.iloc[i].values.reshape(28,28), interpolation = "none", cmap='binary')
    plt.xticks([])
    plt.yticks([])
plt.tight_layout()
               
plt.tight_layout
Out[16]:
<function matplotlib.pyplot.tight_layout(*, pad=1.08, h_pad=None, w_pad=None, rect=None)>
In [17]:
#140 PCS explain 80% of data
X_=training_df
X_std_=StandardScaler().fit_transform(X_)
pca_=PCA(140)
X_140d=pca_.fit_transform(X_std_)
Target=target
In [18]:
trace = go.Scatter(
    x = X_140d[:,0],
    y = X_140d[:,1],
    name = str(Target),
    
    mode = 'markers',
    text = Target,
    showlegend = False,
    marker = dict(
        size = 8,
        color = Target,
        colorscale ='Jet',
        showscale = False,
        line = dict(
            width = 2,
            color = 'rgb(255, 255, 255)'
        ),
        
        opacity = 0.8
    )
)

data=[trace]

layout=go.Layout(title='PCA',
                hovermode='closest',
                xaxis=dict(
                    title='First principal direction',
                    ticklen=5,
                    zeroline=False),
                 yaxis=dict(
                 title='Second principal direction',
                 ticklen=5
            ),
                 showlegend=True
                
                    
                )
fig=dict(data=data,layout=layout)
py.iplot(fig,filename='pca')
In [19]:
#now see how good clustering does using PCs
kmeans=KMeans(10)
X_clustered140=kmeans.fit_predict(X_140d)
In [20]:
tracekmeans = go.Scatter(x=X_140d[:, 0], y= X_140d[:, 1], mode="markers",
                    showlegend=False,
                    marker=dict(
                            size=8,
                            color = X_clustered140,
                            colorscale = 'Portland',
                            showscale=False, 
                            line = dict(
            width = 2,
            color = 'rgb(255, 255, 255)'
        )
                   ))


layout=go.Layout(title='Kmeans clustering',
                 hovermode='closest',
                 xaxis=dict(title='first principal direction',
                           ticklen=5,
                           zeroline=False,
                           gridwidth=2),
                 yaxis=dict(title='second principal component',
                           ticklen=5,
                           gridwidth=2),
                 showlegend=True
                     )

data = [tracekmeans]
fig1 = dict(data=data, layout= layout)
py.iplot(fig1, filename="svm")
In [21]:
x_clusters_df=pd.DataFrame(X_clustered140, dtype=int)
x_clusters_df.columns=['Cluster']
targeted_df=pd.DataFrame(Target,dtype=int)
In [22]:
pd.crosstab(targeted_df.label, x_clusters_df.Cluster)
Out[22]:
Cluster 0 1 2 3 4 5 6 7 8 9
label
0 66 5 441 206 17 2319 793 278 4 3
1 9 1 96 10 4546 0 13 5 1 3
2 1347 99 64 606 489 19 478 1039 21 15
3 434 109 87 67 363 7 2486 682 69 47
4 52 2407 592 77 330 32 2 10 175 395
5 154 166 1602 65 299 21 1304 98 33 53
6 228 18 105 3206 328 176 63 8 4 1
7 8 921 87 3 318 12 8 12 569 2463
8 113 202 1265 17 697 23 1575 44 38 89
9 20 2078 91 3 223 27 80 7 232 1427
In [23]:
#use three metrics
homogeneity_score(Target, X_clustered140)
Out[23]:
0.4177827208776064
In [24]:
metrics.silhouette_score(X_140d, X_clustered140)
Out[24]:
0.032905805821601755
In [25]:
metrics.completeness_score(Target, X_clustered140)
Out[25]:
0.44067042882684243
In [26]:
#do same thing with 319 PCs and 784 PCs
X_=training_df
X_std_=StandardScaler().fit_transform(X_)
pca_=PCA(319)
X_319d=pca_.fit_transform(X_std_)
Target=target
In [27]:
trace = go.Scatter(
    x = X_319d[:,0],
    y = X_319d[:,1],
    name = str(Target),
    
    mode = 'markers',
    text = Target,
    showlegend = False,
    marker = dict(
        size = 8,
        color = Target,
        colorscale ='Jet',
        showscale = False,
        line = dict(
            width = 2,
            color = 'rgb(255, 255, 255)'
        ),
        
        opacity = 0.8
    )
)

data=[trace]

layout=go.Layout(title='PCA',
                hovermode='closest',
                xaxis=dict(
                    title='First principal direction',
                    ticklen=5,
                    zeroline=False),
                 yaxis=dict(
                 title='Second principal direction',
                 ticklen=5
            ),
                 showlegend=True
                
                    
                )
fig=dict(data=data,layout=layout)
py.iplot(fig,filename='pca')
In [28]:
kmeans=KMeans(10)
X_clustered319=kmeans.fit_predict(X_319d)
In [29]:
tracekmeans = go.Scatter(x=X_319d[:, 0], y= X_319d[:, 1], mode="markers",
                    showlegend=False,
                    marker=dict(
                            size=8,
                            color = X_clustered319,
                            colorscale = 'Portland',
                            showscale=False, 
                            line = dict(
            width = 2,
            color = 'rgb(255, 255, 255)'
        )
                   ))


layout=go.Layout(title='Kmeans clustering',
                 hovermode='closest',
                 xaxis=dict(title='first principal direction',
                           ticklen=5,
                           zeroline=False,
                           gridwidth=2),
                 yaxis=dict(title='second principal component',
                           ticklen=5,
                           gridwidth=2),
                 showlegend=True
                     )
data = [tracekmeans]
fig1 = dict(data=data, layout= layout)
py.iplot(fig1, filename="svm")
In [30]:
x_clusters_df=pd.DataFrame(X_clustered319, dtype=int)
x_clusters_df.columns=['Cluster']
targeted_df=pd.DataFrame(Target,dtype=int)
In [31]:
pd.crosstab(targeted_df.label, x_clusters_df.Cluster)
Out[31]:
Cluster 0 1 2 3 4 5 6 7 8 9
label
0 3 355 60 212 2246 393 4 5 837 17
1 4 97 9 10 0 4 0 1 12 4547
2 15 47 1387 579 18 1055 24 104 459 489
3 43 85 437 66 6 691 71 117 2475 360
4 363 629 47 78 33 13 191 2391 2 325
5 52 1588 150 66 18 110 38 165 1315 293
6 1 89 230 3208 184 11 4 17 65 328
7 2458 98 8 3 11 11 550 936 9 317
8 85 1252 113 17 22 51 39 201 1586 697
9 1409 103 16 3 25 8 249 2079 81 215
In [32]:
homogeneity_score(Target, X_clustered319)
Out[32]:
0.4176052762095053
In [33]:
metrics.silhouette_score(X_319d, X_clustered319)
Out[33]:
0.013955314779369354
In [34]:
metrics.completeness_score(Target, X_clustered319)
Out[34]:
0.4399566862145544
In [35]:
X_=training_df
X_std_=StandardScaler().fit_transform(X_)
pca_=PCA(784)
X_784d=pca_.fit_transform(X_std_)
Target=target
In [36]:
trace = go.Scatter(
    x = X_784d[:,0],
    y = X_784d[:,1],
    name = str(Target),
    
    mode = 'markers',
    text = Target,
    showlegend = False,
    marker = dict(
        size = 8,
        color = Target,
        colorscale ='Jet',
        showscale = False,
        line = dict(
            width = 2,
            color = 'rgb(255, 255, 255)'
        ),
        
        opacity = 0.8
    )
)

data=[trace]

layout=go.Layout(title='PCA',
                hovermode='closest',
                xaxis=dict(
                    title='First principal direction',
                    ticklen=5,
                    zeroline=False),
                 yaxis=dict(
                 title='Second principal direction',
                 ticklen=5
            ),
                 showlegend=True
                
                    
                )
fig=dict(data=data,layout=layout)
py.iplot(fig,filename='pca')
In [37]:
kmeans=KMeans(10)
X_clustered784=kmeans.fit_predict(X_784d)
In [38]:
tracekmeans = go.Scatter(x=X_784d[:, 0], y= X_784d[:, 1], mode="markers",
                    showlegend=False,
                    marker=dict(
                            size=8,
                            color = X_clustered784,
                            colorscale = 'Portland',
                            showscale=False, 
                            line = dict(
            width = 2,
            color = 'rgb(255, 255, 255)'
        )
                   ))


layout=go.Layout(title='Kmeans clustering',
                 hovermode='closest',
                 xaxis=dict(title='first principal direction',
                           ticklen=5,
                           zeroline=False,
                           gridwidth=2),
                 yaxis=dict(title='second principal component',
                           ticklen=5,
                           gridwidth=2),
                 showlegend=True
                     )
data = [tracekmeans]
fig1 = dict(data=data, layout= layout)
py.iplot(fig1, filename="svm")
In [39]:
x_clusters_df=pd.DataFrame(X_clustered784, dtype=int)
x_clusters_df.columns=['Cluster']
targeted_df=pd.DataFrame(Target,dtype=int)
In [40]:
pd.crosstab(targeted_df.label, x_clusters_df.Cluster)
Out[40]:
Cluster 0 1 2 3 4 5 6 7 8 9
label
0 5 17 197 60 5 282 2310 445 808 3
1 1 4551 10 8 0 5 0 92 14 3
2 29 488 603 1351 95 1044 19 62 470 16
3 76 358 67 436 116 688 6 87 2473 44
4 239 318 82 38 2394 11 37 621 3 329
5 45 296 64 146 171 99 20 1597 1310 47
6 4 325 3207 223 17 8 184 105 63 1
7 533 305 3 8 1001 12 12 87 8 2432
8 52 697 17 105 203 45 23 1257 1582 82
9 304 213 3 14 2084 7 25 96 81 1361
In [41]:
homogeneity_score(Target, X_clustered784)
Out[41]:
0.4180014684767763
In [42]:
metrics.silhouette_score(X_784d, X_clustered784)
Out[42]:
0.006940835813069593
In [43]:
metrics.completeness_score(Target, X_clustered784)
Out[43]:
0.4401191075686792
In [44]:
tracekmeans = go.Scatter(x=X_784d[:, 0], y= X_784d[:, 1], mode="markers",
                    showlegend=False,
                    marker=dict(
                            size=8,
                            color = X_clustered784,
                            colorscale = 'Portland',
                            showscale=False, 
                            line = dict(
            width = 2,
            color = 'rgb(255, 255, 255)'
        )
                   ))


layout=go.Layout(title='Kmeans clustering',
                 hovermode='closest',
                 xaxis=dict(title='first principal direction',
                           ticklen=5,
                           zeroline=False,
                           gridwidth=2),
                 yaxis=dict(title='second principal component',
                           ticklen=5,
                           gridwidth=2),
                 showlegend=True
                     )
data = [tracekmeans]
fig1 = dict(data=data, layout= layout)
py.iplot(fig1, filename="svm")
In [57]:
#DO KMEANS WITHOUT PCS using direct 784 features
kmeans=KMeans(10)
X_clustered=kmeans.fit_predict(training_df)
In [59]:
x_clusters_df=pd.DataFrame(X_clustered, dtype=int)
x_clusters_df.columns=['Cluster']
targeted_df=pd.DataFrame(Target,dtype=int)
In [60]:
pd.crosstab(targeted_df.label, x_clusters_df.Cluster)
Out[60]:
Cluster 0 1 2 3 4 5 6 7 8 9
label
0 24 9 138 125 1985 11 1 9 1715 115
1 6 3 4 5 0 2021 2633 4 0 8
2 126 2912 232 142 9 289 236 48 71 112
3 123 149 2797 43 11 68 285 34 89 752
4 2282 27 0 130 10 216 94 1292 8 13
5 242 12 1268 73 49 565 111 250 173 1052
6 55 49 19 3415 84 164 189 2 98 62
7 1313 31 4 5 9 207 212 2604 9 7
8 127 33 768 35 26 288 176 126 26 2458
9 2117 6 51 12 32 88 146 1668 17 51
In [61]:
homogeneity_score(Target, X_clustered)
Out[61]:
0.4862481675417057
In [62]:
metrics.silhouette_score(X_, X_clustered)
Out[62]:
0.05856046237802117
In [63]:
metrics.completeness_score(Target, X_clustered)
Out[63]:
0.4977077376390712